This page contains a lot of datatables and may take a minute to load (15.4 MB)
Please see the paper, vignette, manual, and vst notebook for much greater detail on specific aspects of DESeq2.
reqpkg <- c("DESeq2","dplyr","magrittr","DT","vsn","pheatmap","RColorBrewer","ggplot2","ggpubr")
for (i in reqpkg) {
print(i)
print(packageVersion(i))
suppressWarnings(suppressMessages(library(i, quietly=T, verbose=T, warn.conflicts=F,character.only=T)))
}
## [1] "DESeq2"
## [1] '1.26.0'
## [1] "dplyr"
## [1] '0.8.5'
## [1] "magrittr"
## [1] '1.5'
## [1] "DT"
## [1] '0.13'
## [1] "vsn"
## [1] '3.54.0'
## [1] "pheatmap"
## [1] '1.0.12'
## [1] "RColorBrewer"
## [1] '1.1.2'
## [1] "ggplot2"
## [1] '3.3.0'
## [1] "ggpubr"
## [1] '0.3.0'
theme_set(theme_pubr())
df <- read.csv2("table_dada2_mfilt_p6364_feature10_count.tsv", sep="\t")
metadata <- read.csv2("48hr_metadata.csv", sep=",")
# pulling sample numbers from column names of df. Accomplished by repeated string splits.
metadata$SampleNo <- strsplit(colnames(df[,-1]), "no") %>% sapply("[", 2) %>% strsplit("zt\\d") %>% sapply("[[", 1) %>% gsub(".{1}$", "", .) %>% as.numeric()
metadata %>% datatable()
Some functions to extract and display a sample from assay data.
dtable <- function(d, n) {
d[, c(1, which(metadata$SampleNo == n)+1)] %>%
datatable(filter="bottom")
}
dtable2 <- function(d, n) {
d[, which(metadata$SampleNo == n)] %>%
datatable(filter="bottom")
}
Taking a look at the data via datatables.
dtable(df,1)
dtable(df,3)
## no9
dtable(df,9)
dtable(df,10)
dtable(df,15)
dtable(df,16)
dtable(df,17)
dtable(df,18)
dtable(df,19)
dtable(df,58)
dtable(df,60)
dtable(df,61)
dtable(df,63)
dtable(df,65)
dtable(df,66)
dtable(df,67)
dtable(df,69)
dtable(df,70)
dtable(df,71)
dtable(df,72)
dtable(df,75)
dtable(df,120)
dtable(df,121)
dtable(df,122)
dtable(df,125)
dtable(df,126)
dtable(df,127)
dtable(df,131)
dtable(df,136)
dtable(df,151)
dtable(df,153)
df %<>%
set_rownames(.$X.OTU.ID) %>%
select(-X.OTU.ID)
condition_list <- data.frame(row.names = colnames(df),
Gender = relevel(metadata$Gender, "M"),
Genotype = relevel(metadata$Genotype, "WT"),
Time = factor(metadata$Time)
)
dds <- DESeq2::DESeqDataSetFromMatrix(countData = df,
colData = condition_list,
design = ~ Gender + Genotype + Gender:Genotype
)
dds <- estimateSizeFactors(dds)
# Creating DESeqTransform object from raw data to compare with transformations downstream
dds_dud <- SummarizedExperiment(counts(dds), colData = colData(dds)) %>%
DESeqTransform()
I’ll attempt 4 types of normalization by DESeq2
Of these, all besides log10 scaling are normalizations that consider library size.
This is \(\log_2 (n + 1)\), where \(n\) is normalized to estimated size factors. However, since counts data tends to prioritize available normalization factors before defaulting to estimated size factors, I’ll first check the available normalization factors and estimated size factors in the DESeqData object.
The result of this function are psuedocounts, basically just scaled counts.
normalizationFactors(dds)
## NULL
sizeFactors(dds) %>% head()
## no1.zt14 no1.zt2 no1.zt20 no1.zt26 no1.zt32 no1.zt38
## 10.6246608 0.5244896 3.3367529 2.7193537 9.2220258 2.4875793
ntd <- normTransform(dds)
ntd_assay <- assay(ntd)
colData(ntd)
## DataFrame with 245 rows and 4 columns
## Gender Genotype Time sizeFactor
## <factor> <factor> <factor> <numeric>
## no1.zt14 M WT 14 10.6246608037157
## no1.zt2 M WT 2 0.524489602440125
## no1.zt20 M WT 20 3.33675289933337
## no1.zt26 M WT 26 2.71935371017527
## no1.zt32 M WT 32 9.22202575261865
## ... ... ... ... ...
## no9.zt26 M KO 26 0.598417660688828
## no9.zt32 M KO 32 1.42561269082297
## no9.zt38 M KO 38 2.20485438587687
## no9.zt44 M KO 44 0.892131838055297
## no9.zt8 M KO 8 1.27675754651139
Below are tables for every sample’s ntd output.
dtable2(ntd_assay, 1)
dtable2(ntd_assay,3)
### no9
dtable2(ntd_assay,9)
dtable2(ntd_assay,10)
dtable2(ntd_assay,15)
dtable2(ntd_assay,16)
dtable2(ntd_assay,17)
dtable2(ntd_assay,18)
dtable2(ntd_assay,19)
dtable2(ntd_assay,58)
dtable2(ntd_assay,60)
dtable2(ntd_assay,61)
dtable2(ntd_assay,63)
dtable2(ntd_assay,65)
dtable2(ntd_assay,66)
dtable2(ntd_assay,67)
dtable2(ntd_assay,69)
dtable2(ntd_assay,70)
dtable2(ntd_assay,71)
dtable2(ntd_assay,72)
dtable2(ntd_assay,75)
dtable2(ntd_assay,120)
dtable2(ntd_assay,121)
dtable2(ntd_assay,122)
dtable2(ntd_assay,125)
dtable2(ntd_assay,126)
dtable2(ntd_assay,127)
dtable2(ntd_assay,131)
dtable2(ntd_assay,136)
dtable2(ntd_assay,151)
dtable2(ntd_assay,153)
Conceptually, vst transforms data to make it homoscedastic, that is the variance across ranks (i.e each sample) should be stabilized.
DESeq2 requires > 10000 items/sample to run vst, the subset and faster version of variance stabilization. We need to use varianceStabilizingTransformation because there are orders of magnitude less items/sample in 16S data.
vsd <- varianceStabilizingTransformation(dds)
vsd_assay <- assay(vsd)
colData(vsd)
## DataFrame with 245 rows and 4 columns
## Gender Genotype Time sizeFactor
## <factor> <factor> <factor> <numeric>
## no1.zt14 M WT 14 10.6246608037157
## no1.zt2 M WT 2 0.524489602440125
## no1.zt20 M WT 20 3.33675289933337
## no1.zt26 M WT 26 2.71935371017527
## no1.zt32 M WT 32 9.22202575261865
## ... ... ... ... ...
## no9.zt26 M KO 26 0.598417660688828
## no9.zt32 M KO 32 1.42561269082297
## no9.zt38 M KO 38 2.20485438587687
## no9.zt44 M KO 44 0.892131838055297
## no9.zt8 M KO 8 1.27675754651139
Below are tables for every sample’s vst output.
dtable2(vsd_assay, 1)
dtable2(vsd_assay,3)
### no9
dtable2(vsd_assay,9)
dtable2(vsd_assay,10)
dtable2(vsd_assay,15)
dtable2(vsd_assay,16)
dtable2(vsd_assay,17)
dtable2(vsd_assay,18)
dtable2(vsd_assay,19)
dtable2(vsd_assay,58)
dtable2(vsd_assay,60)
dtable2(vsd_assay,61)
dtable2(vsd_assay,63)
dtable2(vsd_assay,65)
dtable2(vsd_assay,66)
dtable2(vsd_assay,67)
dtable2(vsd_assay,69)
dtable2(vsd_assay,70)
dtable2(vsd_assay,71)
dtable2(vsd_assay,72)
dtable2(vsd_assay,75)
dtable2(vsd_assay,120)
dtable2(vsd_assay,121)
dtable2(vsd_assay,122)
dtable2(vsd_assay,125)
dtable2(vsd_assay,126)
dtable2(vsd_assay,127)
dtable2(vsd_assay,131)
dtable2(vsd_assay,136)
dtable2(vsd_assay,151)
dtable2(vsd_assay,153)
This is similar to \(\log_2 (n + n_0)\), where \(n\) is the counts and \(n_0\) is a positive constant. Here, \(n_0\) is varied based on the dispersion-mean trend, in the equation (taken from the DESeq2 vignette, refer to vignette for more details) \(\log_2 (q_{ij} = \beta_{i0} + \beta_{ij})\).
rld <- rlog(dds)
## rlog() may take a long time with 50 or more samples,
## vst() is a much faster transformation
rld_assay <- assay(rld)
colData(rld)
## DataFrame with 245 rows and 4 columns
## Gender Genotype Time sizeFactor
## <factor> <factor> <factor> <numeric>
## no1.zt14 M WT 14 10.6246608037157
## no1.zt2 M WT 2 0.524489602440125
## no1.zt20 M WT 20 3.33675289933337
## no1.zt26 M WT 26 2.71935371017527
## no1.zt32 M WT 32 9.22202575261865
## ... ... ... ... ...
## no9.zt26 M KO 26 0.598417660688828
## no9.zt32 M KO 32 1.42561269082297
## no9.zt38 M KO 38 2.20485438587687
## no9.zt44 M KO 44 0.892131838055297
## no9.zt8 M KO 8 1.27675754651139
Below are tables for every sample’s rlog output.
dtable2(rld_assay, 1)
dtable2(rld_assay,3)
### no9
dtable2(rld_assay,9)
dtable2(rld_assay,10)
dtable2(rld_assay,15)
dtable2(rld_assay,16)
dtable2(rld_assay,17)
dtable2(rld_assay,18)
dtable2(rld_assay,19)
dtable2(rld_assay,58)
dtable2(rld_assay,60)
dtable2(rld_assay,61)
dtable2(rld_assay,63)
dtable2(rld_assay,65)
dtable2(rld_assay,66)
dtable2(rld_assay,67)
dtable2(rld_assay,69)
dtable2(rld_assay,70)
dtable2(rld_assay,71)
dtable2(rld_assay,72)
dtable2(rld_assay,75)
dtable2(rld_assay,120)
dtable2(rld_assay,121)
dtable2(rld_assay,122)
dtable2(rld_assay,125)
dtable2(rld_assay,126)
dtable2(rld_assay,127)
dtable2(rld_assay,131)
dtable2(rld_assay,136)
dtable2(rld_assay,151)
dtable2(rld_assay,153)
I’ll create a custom DESeqTransform object from log-transformed counts.
esf <- SummarizedExperiment(log(counts(dds)+1), colData = colData(dds)) %>%
DESeqTransform()
esf_assay <- assay(esf)
colData(esf)
## DataFrame with 245 rows and 4 columns
## Gender Genotype Time sizeFactor
## <factor> <factor> <factor> <numeric>
## no1.zt14 M WT 14 10.6246608037157
## no1.zt2 M WT 2 0.524489602440125
## no1.zt20 M WT 20 3.33675289933337
## no1.zt26 M WT 26 2.71935371017527
## no1.zt32 M WT 32 9.22202575261865
## ... ... ... ... ...
## no9.zt26 M KO 26 0.598417660688828
## no9.zt32 M KO 32 1.42561269082297
## no9.zt38 M KO 38 2.20485438587687
## no9.zt44 M KO 44 0.892131838055297
## no9.zt8 M KO 8 1.27675754651139
Below are tables for every sample’s \(log_{10}\) output.
dtable2(esf_assay, 1)
dtable2(esf_assay,3)
### no9
dtable2(esf_assay,9)
dtable2(esf_assay,10)
dtable2(esf_assay,15)
dtable2(esf_assay,16)
dtable2(esf_assay,17)
dtable2(esf_assay,18)
dtable2(esf_assay,19)
dtable2(esf_assay,58)
dtable2(esf_assay,60)
dtable2(esf_assay,61)
dtable2(esf_assay,63)
dtable2(esf_assay,65)
dtable2(esf_assay,66)
dtable2(esf_assay,67)
dtable2(esf_assay,69)
dtable2(esf_assay,70)
dtable2(esf_assay,71)
dtable2(esf_assay,72)
dtable2(esf_assay,75)
dtable2(esf_assay,120)
dtable2(esf_assay,121)
dtable2(esf_assay,122)
dtable2(esf_assay,125)
dtable2(esf_assay,126)
dtable2(esf_assay,127)
dtable2(esf_assay,131)
dtable2(esf_assay,136)
dtable2(esf_assay,151)
dtable2(esf_assay,153)
These graphs should help to understand how each transformation is affecting variance in the dataset. Standard deviation is graphed against means. Ideally, variance would be constant for different means and should result in a flat curve. Generally, variance shouldn’t be dependent on the mean.
p0 <- dds_dud %>% assay() %>% meanSdPlot()
p1 <- ntd_assay %>% meanSdPlot()
p2 <- vsd_assay %>% meanSdPlot()
p3 <- rld_assay %>% meanSdPlot()
p4 <- esf_assay %>% meanSdPlot()
Taking top 20 counts/estimatedSizeFactors, and generating a pheatmap annotation
select <- order(rowMeans(counts(dds,normalized=TRUE)),
decreasing=TRUE)[1:20]
anno <- as.data.frame(colData(dds)[,c("Gender","Genotype","Time")])
pheatmap(assay(dds_dud)[select,], cluster_rows=FALSE, show_colnames = FALSE, show_rownames=FALSE,
cluster_cols=FALSE, annotation_col=anno)
pheatmap(ntd_assay[select,], cluster_rows=FALSE, show_colnames = FALSE, show_rownames=FALSE,
cluster_cols=FALSE, annotation_col=anno)
pheatmap(vsd_assay[select,], cluster_rows=FALSE, show_colnames = FALSE, show_rownames=FALSE,
cluster_cols=FALSE, annotation_col=anno)
pheatmap(rld_assay[select,], cluster_rows=FALSE, show_colnames = FALSE, show_rownames=FALSE,
cluster_cols=FALSE, annotation_col=anno)
pheatmap(esf_assay[select,], cluster_rows=FALSE, show_colnames = FALSE, show_rownames=FALSE,
cluster_cols=FALSE, annotation_col=anno)
Procuring a distance matric. It portrays how similar samples are to each others. Darker blue = more similar.
sampleDists <- dist(t(assay(dds_dud)))
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- paste(dds_dud$Gender, dds_dud$Genotype, sep="-")
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
clustering_distance_rows=sampleDists,
clustering_distance_cols=sampleDists,
col=colors,
show_rownames = F)
sampleDists <- dist(t(ntd_assay))
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- paste(ntd$Gender, ntd$Genotype, sep="-")
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
clustering_distance_rows=sampleDists,
clustering_distance_cols=sampleDists,
col=colors,
show_rownames = F)
sampleDists <- dist(t(vsd_assay))
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- paste(vsd$Gender, vsd$Genotype, sep="-")
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
clustering_distance_rows=sampleDists,
clustering_distance_cols=sampleDists,
col=colors,
show_rownames = F)
sampleDists <- dist(t(rld_assay))
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- paste(rld$Gender, rld$Genotype, sep="-")
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
clustering_distance_rows=sampleDists,
clustering_distance_cols=sampleDists,
col=colors,
show_rownames = F)
sampleDists <- dist(t(esf_assay))
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- paste(esf$Gender, esf$Genotype, sep="-")
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
clustering_distance_rows=sampleDists,
clustering_distance_cols=sampleDists,
col=colors,
show_rownames = F)
plotPCA(dds_dud, intgroup=c("Gender", "Genotype"))
plotPCA(dds_dud, intgroup=c("Time"))
plotPCA(ntd, intgroup=c("Gender", "Genotype"))
plotPCA(ntd, intgroup=c("Time"))
plotPCA(vsd, intgroup=c("Gender", "Genotype"))
plotPCA(vsd, intgroup=c("Time"))
plotPCA(rld, intgroup=c("Gender", "Genotype"))
plotPCA(rld, intgroup=c("Time"))
plotPCA(esf, intgroup=c("Gender", "Genotype"))
plotPCA(esf, intgroup=c("Time"))
In terms of homoscedasticity, vst leads to the most normalized variance between samples. It also does a better job in reducing sample distance than log transformation alone. rlog had less homoscedasticity compared to vst, but it generated outliers through normalization - it is unclear whether those outliers were generated or all other values were shrunken, but given rlog’s method it is most likely that all other values were shrunk. Thus, while sample distances were rendered extremely small, it led to heavy outliers that make it less favorable.
Log10 transformation alone and ntd were pretty similar, so normalizing to estimated size factors did not lead to any notable difference. Both had more variance relative to the means compared to vst. Distribution was also fine in PCA - no obvious outliers.
Gratifyingly, all techniques used resulted in the same clustering by pheatmap as the raw counts data, implying that trends in the data were preserved through all transformations.
In the end, I would rank the normalization techniques, from best normalization of this data to worst with respect to resulting distribution, homoscedasticity, and sample distance, as \(vst > ntd \approx log10 >> rlog\).
Saving vst data to csv file.
write.csv(vsd_assay, "table_dada2_mfilt_p6364_feature10_count_vstnormalized.csv")
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Scientific Linux 7.4 (Nitrogen)
##
## Matrix products: default
## BLAS/LAPACK: /software/openblas-0.2.19-el7-x86_64/lib/libopenblas_haswellp-r0.2.19.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] ggpubr_0.3.0 ggplot2_3.3.0
## [3] RColorBrewer_1.1-2 pheatmap_1.0.12
## [5] vsn_3.54.0 DT_0.13
## [7] magrittr_1.5 dplyr_0.8.5
## [9] DESeq2_1.26.0 SummarizedExperiment_1.16.1
## [11] DelayedArray_0.12.3 BiocParallel_1.20.1
## [13] matrixStats_0.56.0 Biobase_2.46.0
## [15] GenomicRanges_1.38.0 GenomeInfoDb_1.22.1
## [17] IRanges_2.20.2 S4Vectors_0.24.4
## [19] BiocGenerics_0.32.0
##
## loaded via a namespace (and not attached):
## [1] colorspace_1.4-1 ggsignif_0.6.0 ellipsis_0.3.0
## [4] rio_0.5.16 htmlTable_1.13.3 XVector_0.26.0
## [7] base64enc_0.1-3 rstudioapi_0.11 hexbin_1.27.3
## [10] farver_2.0.3 affyio_1.56.0 bit64_0.9-7
## [13] AnnotationDbi_1.48.0 splines_3.6.1 geneplotter_1.64.0
## [16] knitr_1.28 jsonlite_1.6 Formula_1.2-3
## [19] broom_0.5.6 annotate_1.64.0 cluster_2.1.0
## [22] shiny_1.3.2 BiocManager_1.30.10 compiler_3.6.1
## [25] backports_1.1.7 assertthat_0.2.1 Matrix_1.2-18
## [28] limma_3.42.2 later_0.8.0 acepack_1.4.1
## [31] htmltools_0.3.6 tools_3.6.1 gtable_0.3.0
## [34] glue_1.4.1 GenomeInfoDbData_1.2.2 affy_1.64.0
## [37] Rcpp_1.0.4.6 carData_3.0-3 cellranger_1.1.0
## [40] vctrs_0.3.0 preprocessCore_1.48.0 nlme_3.1-140
## [43] crosstalk_1.0.0 xfun_0.13 stringr_1.4.0
## [46] openxlsx_4.1.5 mime_0.7 lifecycle_0.2.0
## [49] rstatix_0.5.0 XML_3.99-0.3 zlibbioc_1.32.0
## [52] scales_1.1.1 promises_1.0.1 hms_0.5.3
## [55] yaml_2.2.0 curl_3.3 memoise_1.1.0
## [58] gridExtra_2.3 rpart_4.1-15 latticeExtra_0.6-28
## [61] stringi_1.4.6 RSQLite_2.2.0 genefilter_1.68.0
## [64] checkmate_2.0.0 zip_2.0.4 rlang_0.4.6
## [67] pkgconfig_2.0.3 bitops_1.0-6 evaluate_0.14
## [70] lattice_0.20-38 purrr_0.3.2 labeling_0.3
## [73] htmlwidgets_1.3 bit_1.1-15.2 tidyselect_1.1.0
## [76] R6_2.4.1 generics_0.0.2 Hmisc_4.2-0
## [79] DBI_1.1.0 pillar_1.4.4 haven_2.2.0
## [82] foreign_0.8-71 withr_2.2.0 survival_3.1-12
## [85] abind_1.4-5 RCurl_1.98-1.2 nnet_7.3-12
## [88] tibble_3.0.1 crayon_1.3.4 car_3.0-7
## [91] rmarkdown_1.13 locfit_1.5-9.4 grid_3.6.1
## [94] readxl_1.3.1 data.table_1.12.8 blob_1.2.1
## [97] forcats_0.5.0 digest_0.6.25 xtable_1.8-4
## [100] tidyr_1.0.3 httpuv_1.5.1 munsell_0.5.0